Import Libraries#
We are importing different libraries for the data processing and analysis
import pandas as pd
import geopandas as gpd
import datetime
import jenkspy
import seaborn as sns
import numpy as np
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
import matplotlib.pyplot as plt
Reading and Exploring Data#
The dataset was provided with the task as a csv file with documentation describing data in different columns. It contains data as output from LiDAR-based Multiple Object Tracking system. In the following steps we will explore the dataset.
Reading dataset and getting general info about data in different columns, showing the head part of the dataframe:
# Load the raw LiDAR tracking export; the file uses ';' as the column separator
data = pd.read_csv('full_intersection_15.csv', sep=";")
# Column dtypes / non-null counts, then the first rows for a quick sanity check
data.info()
data.head()
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[2], line 1
----> 1 data = pd.read_csv('full_intersection_15.csv', sep=";")
2 data.info()
3 data.head()
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
1439 self.options["has_index_names"] = kwds["has_index_names"]
1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
1733 if "b" not in mode:
1734 mode += "b"
-> 1735 self.handles = get_handle(
1736 f,
1737 mode,
1738 encoding=self.options.get("encoding", None),
1739 compression=self.options.get("compression", None),
1740 memory_map=self.options.get("memory_map", False),
1741 is_text=is_text,
1742 errors=self.options.get("encoding_errors", "strict"),
1743 storage_options=self.options.get("storage_options", None),
1744 )
1745 assert self.handles is not None
1746 f = self.handles.handle
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
851 elif isinstance(handle, str):
852 # Check whether the filename is to be opened in binary mode.
853 # Binary mode does not support 'encoding' and 'newline'.
854 if ioargs.encoding and "b" not in ioargs.mode:
855 # Encoding
--> 856 handle = open(
857 handle,
858 ioargs.mode,
859 encoding=ioargs.encoding,
860 errors=errors,
861 newline="",
862 )
863 else:
864 # Binary mode
865 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'full_intersection_15.csv'
The data has 2563480 entries (trajectory points) with different characteristics such as object_id, timestamp (in ms), longitude and latitude, heading, height, width, length of the vehicle, velocity, tracking status and object_type
Exploring the data#
Area of Research#
The data was collected in Salzburg, Austria at the intersection of Sterneckstraße, Linzer Bundesstraße, Ignaz-Härtl-Straße and Fürbergstraße
# 200 m study-area buffer around the intersection centre; the point is
# reprojected to UTM zone 33N (EPSG:32633) so the buffer radius is in metres
center = gpd.GeoSeries([Point(13.064405, 47.810136)], crs="EPSG:4326")
buffer = center.to_crs(epsg=32633).buffer(200)
buffer_gdf = gpd.GeoDataFrame(geometry=buffer, crs=32633)
# Interactive map of the study area on satellite imagery
buffer_gdf.explore(tiles='Esri.WorldImagery')
Time#
# Collection period: convert the min/max epoch timestamps (ms) to datetimes
ts_min = data['timestamp'].min()
ts_max = data['timestamp'].max()
minData = datetime.datetime.fromtimestamp(int(ts_min / 1000))
maxData = datetime.datetime.fromtimestamp(int(ts_max / 1000))
print("It was collected from: %s, To: %s" % (minData, maxData))
It was collected from: 2024-04-02 16:00:00, To: 2024-04-02 16:59:59
Total observations#
# Dataset size: individual values in the frame and distinct tracked objects
total_values = data.size
unique_objects = data['object_id'].nunique()
print("Total observations in a raw dataset: %i" % total_values)
print("Total unique objects: %i" % unique_objects)
Total observations in a raw dataset: 28198280
Total unique objects: 34580
Tracking Status#
One of the main point characteristics is the tracking status. Let’s see the structure of the points based on this column:
# Share of trajectory points per tracking status, shown as a pie chart
data_by_status = data.groupby("tracking_status").size()
plt.figure(figsize=(5, 5))
data_by_status.plot.pie(colors=sns.color_palette("Set2"), autopct="%.2f")
plt.title('Distribution by Tracking Status')
plt.show()
More than 82% of all points have the “TRACKING” status, which indicates “stable tracking”. The other categories — “INVALIDATING”, “DRIFTING” and “VALIDATING” — are not considered ‘stable tracking’, so we will filter this data out and won’t consider it in the following analysis
Filtering Objects Based on Tracking Status and Calculating New Number of Observations#
# Keep only points flagged as stable tracking; .copy() makes the slice an
# independent frame so later column assignments (e.g. time_class) don't raise
# pandas' SettingWithCopyWarning on a view
data_cleaned = data[data['tracking_status']=='TRACKING'].copy()
print("Observations Total (cleaned data): %i. Share of raw: %i percent" % (data_cleaned.size, (data_cleaned.size/data.size)*100))
print("Total unique objects (cleaned data): %i. Share of raw: %i percent" % (data_cleaned['object_id'].nunique(), (data_cleaned['object_id'].nunique()/data['object_id'].nunique())*100))
Observations Total (cleaned data): 23349700. Share of raw: 82 percent
Total unique objects (cleaned data): 11025. Share of raw: 31 percent
Objects Types#
The data was collected for different object types: cars, two-wheelers, and pedestrians. Let’s explore the share of each object type in the data.
# Share of trajectory points per object type (stable-tracking data only)
data_by_object = data_cleaned.groupby("object_type").size()
plt.figure(figsize=(5, 5))
data_by_object.plot.pie(colors=sns.color_palette("Set2"), autopct="%.2f")
plt.title('Distribution by Object Type')
plt.show()
More than 58% are cars, around 21% are pedestrians, and 4% are two-wheelers. Additionally, 15.78% are classified as “MISC,” which means that the objects are not considered part of any of the three main classes. We will filter out these objects and, in the following analysis, consider only the three main types.
Filtering Objects Based on Object Type and Calculating New Number of Observations#
# Drop unclassified ("MISC") objects; .copy() avoids SettingWithCopyWarning
# when new columns (e.g. time_class) are later assigned to this slice
data_cleaned = data_cleaned[data_cleaned['object_type'] != 'MISC'].copy()
Data Processing#
In the following section, we will process the data to enhance opportunities for further analysis. We will divide the data into 5-minute intervals, create a GeoDataFrame to work with this dataset as spatial data, create lines (directions) for each object based on point data, and calculate various statistics for each object.
Dividing data into classes based on 5 minutes interval#
We are dividing our dataset into 5-minute intervals for the final estimations in the data quality assessment
# Index each point into a 5-minute bin, counted from the earliest timestamp
min_timestamp = data_cleaned['timestamp'].min()
# Bin width expressed in the same unit as the timestamps (milliseconds)
milliseconds = pd.Timedelta(minutes=5).total_seconds() * 1000
data_cleaned['time_class'] = ((data_cleaned['timestamp'] - min_timestamp) // milliseconds).astype(int)
data_cleaned.head()
| object_id | timestamp | heading | height | width | length | v | tracking_status | object_type | lon | lat | time_class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | 1712062811083 | 132.072 | 0.735 | 1.942 | 4.387 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 |
| 1 | 152997118 | 1712062811183 | 132.286 | 0.731 | 1.861 | 4.320 | 0.04 | TRACKING | CAR | 13.064405 | 47.810136 | 0 |
| 2 | 152997118 | 1712062811283 | 132.229 | 0.739 | 1.955 | 4.395 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 |
| 3 | 152997118 | 1712062811386 | 131.980 | 0.729 | 1.942 | 4.367 | 0.04 | TRACKING | CAR | 13.064405 | 47.810135 | 0 |
| 4 | 152997118 | 1712062811485 | 132.020 | 0.737 | 1.875 | 4.328 | 0.02 | TRACKING | CAR | 13.064405 | 47.810136 | 0 |
Creating GeoData#
Based on coordinates provided in a csv file, we are creating points as spatial data - geoDataFrame
# Promote the table to a GeoDataFrame: one WGS84 point per (lon, lat) pair
point_geoms = gpd.points_from_xy(data_cleaned['lon'], data_cleaned['lat'])
points_gdf = gpd.GeoDataFrame(data_cleaned, geometry=point_geoms, crs=4326)
points_gdf.head()
| object_id | timestamp | heading | height | width | length | v | tracking_status | object_type | lon | lat | time_class | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | 1712062811083 | 132.072 | 0.735 | 1.942 | 4.387 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06440 47.81014) |
| 1 | 152997118 | 1712062811183 | 132.286 | 0.731 | 1.861 | 4.320 | 0.04 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06441 47.81014) |
| 2 | 152997118 | 1712062811283 | 132.229 | 0.739 | 1.955 | 4.395 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06440 47.81014) |
| 3 | 152997118 | 1712062811386 | 131.980 | 0.729 | 1.942 | 4.367 | 0.04 | TRACKING | CAR | 13.064405 | 47.810135 | 0 | POINT (13.06441 47.81014) |
| 4 | 152997118 | 1712062811485 | 132.020 | 0.737 | 1.875 | 4.328 | 0.02 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06441 47.81014) |
Calculate the distance between the first and the last point#
As one of the characteristics for the analysis, we are calculating the distance between the first and the last point for each object
# Reproject to the local UTM zone so distances come out in metres
points_gdf = points_gdf.to_crs(points_gdf.estimate_utm_crs())

def calculate_distance(group):
    """Return the straight-line distance (m) between a trajectory's first and
    last point, or None when the object has a single observation."""
    if len(group) > 1:
        return group.iloc[0].distance(group.iloc[-1])
    return None

# Group only the geometry column: applying over the whole GeoDataFrame (as the
# original did) is slower and hits pandas' include_groups deprecation
distances = (
    points_gdf.groupby('object_id')['geometry']
    .apply(calculate_distance)
    .reset_index(name='distance_first_last')
)
# Broadcast the per-object distance back onto every trajectory point
points_gdf = points_gdf.merge(distances, on='object_id', how='left')
# Return to WGS84 for mapping/storage
points_gdf = points_gdf.to_crs(epsg=4326)
points_gdf.head()
| object_id | timestamp | heading | height | width | length | v | tracking_status | object_type | lon | lat | time_class | geometry | distance_first_last | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | 1712062811083 | 132.072 | 0.735 | 1.942 | 4.387 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06440 47.81014) | 0.165151 |
| 1 | 152997118 | 1712062811183 | 132.286 | 0.731 | 1.861 | 4.320 | 0.04 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06441 47.81014) | 0.165151 |
| 2 | 152997118 | 1712062811283 | 132.229 | 0.739 | 1.955 | 4.395 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06440 47.81014) | 0.165151 |
| 3 | 152997118 | 1712062811386 | 131.980 | 0.729 | 1.942 | 4.367 | 0.04 | TRACKING | CAR | 13.064405 | 47.810135 | 0 | POINT (13.06441 47.81014) | 0.165151 |
| 4 | 152997118 | 1712062811485 | 132.020 | 0.737 | 1.875 | 4.328 | 0.02 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06441 47.81014) | 0.165151 |
Calculate average heading and velocity change#
As one of the characteristics for the analysis, we are calculating the average heading and velocity change for each object to explore possible anomalies based on these parameters
def calculate_average_change(df, value_field):
    """Add per-point change and per-object average change columns for *value_field*.

    Creates '<value_field>_change' (difference between consecutive points of
    the same object, first point of each object set to 0) and
    'avg_<value_field>_change' (mean of those changes per object), then
    returns the updated DataFrame.

    Fixes vs the original: comments no longer claim "heading" for every
    field, and the per-object mean is broadcast with groupby().transform(),
    which preserves the index instead of resetting it via merge (and cannot
    collide with existing column suffixes).
    """
    change_field = f'{value_field}_change'
    avg_change_field = f'avg_{value_field}_change'
    # Change of the value between consecutive points within each trajectory
    df[change_field] = df.groupby('object_id')[value_field].diff().fillna(0)
    # Per-object mean change, broadcast back onto every row of that object
    df[avg_change_field] = df.groupby('object_id')[change_field].transform('mean')
    return df
# Derive per-point change and per-object average change for heading and velocity
for change_source in ('heading', 'v'):
    points_gdf = calculate_average_change(points_gdf, change_source)
Creating Lines from Point Observations#
Creating lines from point observation to see the actual path for each object
# Group by 'object_id' and create LineString geometries
def create_linestring(group):
    """Build a LineString from an ordered point sequence.

    Returns None when the sequence has fewer than two points (a LineString
    needs at least two vertices)."""
    return LineString(group) if len(group) > 1 else None
# Collapse each object's ordered points into a single LineString geometry
lines = points_gdf.groupby('object_id')['geometry'].apply(
    lambda pts: create_linestring(list(pts))
)
# Wrap the per-object lines into a GeoDataFrame in WGS84
lines_gdf = gpd.GeoDataFrame(lines, geometry='geometry', crs=4326)
lines_gdf = lines_gdf.reset_index().rename(columns={'index': 'object_id'})
# Drop objects with a single observation (no line could be built)
lines_gdf = lines_gdf[lines_gdf['geometry'].notnull()]
Calculate Statistics for each line#
Calculating different statistics for each line such as: overall time of movement, average heading, velocity, width, height and length
# Per-object summary statistics
per_object = points_gdf.groupby('object_id')
# Observation duration: last minus first timestamp (milliseconds)
time_diff = per_object['timestamp'].apply(lambda ts: ts.max() - ts.min())
# Median object characteristics (median is robust against noisy single frames)
avg_heading = per_object['heading'].median()
avg_v = per_object['v'].median()
avg_width = per_object['width'].median()
avg_length = per_object['length'].median()
avg_height = per_object['height'].median()
# Assemble everything into one DataFrame indexed by object_id
statistics_df = pd.DataFrame({'total_time': time_diff, 'avg_heading': avg_heading,
                              'avg_v': avg_v, 'avg_width': avg_width,
                              'avg_length': avg_length, 'avg_height': avg_height})
Adding important Object characteristics from an initial DataFrame
# Carry per-object attributes over from the point data; each of these is
# constant within an object, so the first observed value is representative
carried_columns = ('object_type', 'time_class', 'avg_heading_change',
                   'avg_v_change', 'distance_first_last')
for column in carried_columns:
    statistics_df[column] = points_gdf.groupby('object_id')[column].first().values
Calculate whether the type of the same object has changed
# Comma-separated list of every type observed per object (first-appearance order)
unique_object_types = points_gdf.groupby('object_id')['object_type'].apply(
    lambda types: ', '.join(types.unique())
)
statistics_df['unique_object_types'] = unique_object_types
# One column per object_type holding its per-object observation count
object_type_counts = points_gdf.groupby(['object_id', 'object_type']).size().unstack(fill_value=0)
statistics_df = statistics_df.join(object_type_counts, on='object_id')
# Share (%) of each type within every object's observations
object_type_percentages = object_type_counts.div(object_type_counts.sum(axis=1), axis=0) * 100
# Dominant type per object overrides the first-seen type
top_object_type = object_type_percentages.idxmax(axis=1)
statistics_df['object_type'] = top_object_type
# ... together with how dominant it is
top_object_type_percentage = object_type_percentages.max(axis=1)
statistics_df['top_object_type_percentage'] = top_object_type_percentage
# Turn the object_id index into a regular column
statistics_df.reset_index(inplace=True)
Merging Statistics to GeoDataFrames with lines for each object
# Attach the per-object summary statistics to each object's LineString geometry
lines_gdf_stat = lines_gdf.merge(statistics_df, on='object_id', how='left')
Calculating length for each line
# Compute each route's length in metres (UTM projection), then derive travel
# time in seconds and the detour ratio (route length vs straight-line distance)
utm_crs = lines_gdf.estimate_utm_crs()
lines_gdf_stat = lines_gdf_stat.to_crs(utm_crs)
lines_gdf_stat['route_length'] = lines_gdf_stat.geometry.length
lines_gdf_stat['total_time_s'] = lines_gdf_stat['total_time'] / 1000
lines_gdf_stat['direct_real_d_ratio'] = (
    lines_gdf_stat['route_length'] / lines_gdf_stat['distance_first_last']
)
# Back to WGS84 for mapping
lines_gdf_stat = lines_gdf_stat.to_crs(epsg=4326)
lines_gdf_stat.head()
| object_id | geometry | total_time | avg_heading | avg_v | avg_width | avg_length | avg_height | object_type | time_class | ... | avg_v_change | distance_first_last | unique_object_types | CAR | CYCLIST | PEDESTRIAN | top_object_type_percentage | route_length | total_time_s | direct_real_d_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | LINESTRING (13.06440 47.81014, 13.06441 47.810... | 286302 | 133.7800 | 0.03 | 1.869 | 4.3870 | 1.151 | CAR | 0 | ... | 0.000004 | 0.165151 | CAR | 2748 | 0 | 0 | 100.0 | 102.985753 | 286.302 | 623.585910 |
| 1 | 152997181 | LINESTRING (13.06399 47.81006, 13.06399 47.810... | 138700 | 148.0520 | 0.77 | 0.559 | 0.5585 | 1.332 | PEDESTRIAN | 0 | ... | -0.000261 | 65.637374 | PEDESTRIAN | 0 | 0 | 1342 | 100.0 | 110.534391 | 138.700 | 1.684016 |
| 2 | 152997182 | LINESTRING (13.06413 47.81005, 13.06413 47.810... | 270701 | 139.9335 | 0.02 | 1.912 | 4.5230 | 1.690 | CAR | 0 | ... | 0.000004 | 0.086359 | CAR | 2616 | 0 | 0 | 100.0 | 60.296137 | 270.701 | 698.199732 |
| 3 | 152997183 | LINESTRING (13.06339 47.80977, 13.06339 47.809... | 101799 | 105.4660 | 0.01 | 1.872 | 4.8670 | 1.566 | CAR | 0 | ... | 0.009214 | 119.628432 | CAR | 980 | 0 | 0 | 100.0 | 144.550702 | 101.799 | 1.208331 |
| 4 | 152997184 | LINESTRING (13.06339 47.80978, 13.06340 47.809... | 100897 | 97.4585 | 0.01 | 1.974 | 5.0000 | 1.615 | CAR | 0 | ... | 0.006307 | 111.192417 | CAR | 972 | 0 | 0 | 100.0 | 144.484505 | 100.897 | 1.299410 |
5 rows × 21 columns
In this section, we processed the data, created geodataframe from the CSV file, and calculated summary parameters for further analysis.
General Descriptive Statistics#
This section is about exploring object types, their changes, and summary statistics
0. Object Types#
Let’s look at the distribution of objects based on type
# Normalise a comma-separated type sequence so order of appearance is ignored
def standardize_string(s):
    """Return *s* with its ', '-separated items sorted alphabetically."""
    return ', '.join(sorted(s.split(', ')))
# Normalise type sequences so 'CAR, PEDESTRIAN' and 'PEDESTRIAN, CAR' count together
lines_gdf_stat['unique_object_types_sorted'] = lines_gdf_stat['unique_object_types'].apply(standardize_string)
# Frequency of each (sorted) type combination, most common first
counts = lines_gdf_stat['unique_object_types_sorted'].value_counts().sort_values(ascending=False)
# One distinct colour per bar
palette = sns.color_palette("husl", len(counts))
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=counts.index, y=counts.values, palette=palette)
total = len(lines_gdf_stat)
# Label every bar with its absolute count and share of all objects
for bar, count in zip(ax.patches, counts):
    h = bar.get_height()
    share = f'{h / total * 100:.2f}%'
    ax.annotate(f'{int(h)}\n({share})',
                (bar.get_x() + bar.get_width() / 2., h),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points', fontsize=10)
# Cleaner look without the top/right spines
for spine in ('top', 'right'):
    ax.spines[spine].set_visible(False)
plt.title('Distribution of Object Types')
plt.xlabel('Object Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
This graph is different from the one in the first chapter as it looks at object types not for individual points but for the objects as a whole, also considering changes in object type.
We can see that more than 60% of the objects are cars, around 22.4% are pedestrians, and only 1% are two-wheelers. The remaining 17% are objects that changed their type along the route for some reason. We will explore these categories in the following section.
Exploring Data with TypeChanges#
We are exploring the objects that were changing their type along the route and dividing them into 3 classes based on the share of the main type: less than 80%, between 80% and 90%, and between 90% and 100%
# Flag (1/0) objects whose classified type was not constant along the route
lines_gdf_stat['typeChanges'] = (lines_gdf_stat['top_object_type_percentage'] < 100).astype(int)
# Create the 'typeChangesRate' field
def calculate_type_changes_rate(percentage):
    """Map the dominant-type share to a penalty rate: 0 (stable type) up to 1
    (type very unstable, dominant type below 80%)."""
    if percentage == 100:
        return 0
    if 90 <= percentage < 100:
        return 0.1
    if 80 <= percentage < 90:
        return 0.2
    return 1
# Rate every object by how unstable its classification was
lines_gdf_stat['typeChangesRate'] = lines_gdf_stat['top_object_type_percentage'].map(calculate_type_changes_rate)
# Display the DataFrame with the new fields
lines_gdf_stat.head()
| object_id | geometry | total_time | avg_heading | avg_v | avg_width | avg_length | avg_height | object_type | time_class | ... | CAR | CYCLIST | PEDESTRIAN | top_object_type_percentage | route_length | total_time_s | direct_real_d_ratio | unique_object_types_sorted | typeChanges | typeChangesRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | LINESTRING (13.06440 47.81014, 13.06441 47.810... | 286302 | 133.7800 | 0.03 | 1.869 | 4.3870 | 1.151 | CAR | 0 | ... | 2748 | 0 | 0 | 100.0 | 102.985753 | 286.302 | 623.585910 | CAR | 0 | 0.0 |
| 1 | 152997181 | LINESTRING (13.06399 47.81006, 13.06399 47.810... | 138700 | 148.0520 | 0.77 | 0.559 | 0.5585 | 1.332 | PEDESTRIAN | 0 | ... | 0 | 0 | 1342 | 100.0 | 110.534391 | 138.700 | 1.684016 | PEDESTRIAN | 0 | 0.0 |
| 2 | 152997182 | LINESTRING (13.06413 47.81005, 13.06413 47.810... | 270701 | 139.9335 | 0.02 | 1.912 | 4.5230 | 1.690 | CAR | 0 | ... | 2616 | 0 | 0 | 100.0 | 60.296137 | 270.701 | 698.199732 | CAR | 0 | 0.0 |
| 3 | 152997183 | LINESTRING (13.06339 47.80977, 13.06339 47.809... | 101799 | 105.4660 | 0.01 | 1.872 | 4.8670 | 1.566 | CAR | 0 | ... | 980 | 0 | 0 | 100.0 | 144.550702 | 101.799 | 1.208331 | CAR | 0 | 0.0 |
| 4 | 152997184 | LINESTRING (13.06339 47.80978, 13.06340 47.809... | 100897 | 97.4585 | 0.01 | 1.974 | 5.0000 | 1.615 | CAR | 0 | ... | 972 | 0 | 0 | 100.0 | 144.484505 | 100.897 | 1.299410 | CAR | 0 | 0.0 |
5 rows × 24 columns
Filtering Data that has typeChanges along the route
# Keep only objects whose classified type changed at least once along the route
lines_type_changes = lines_gdf_stat[lines_gdf_stat['typeChanges']>0]
Creating histogram to explore the distribution of objects based on the share of the main object
# Histogram of the dominant-type share for objects whose type changed
plt.figure(figsize=(10, 6))
ax = sns.histplot(lines_type_changes['top_object_type_percentage'], kde=False, color='skyblue')
total = len(lines_type_changes)
# Label every bar with its absolute count and share
for bar in ax.patches:
    h = bar.get_height()
    share = f'{h / total * 100:.2f}%'
    ax.annotate(f'{int(h)}\n({share})',
                (bar.get_x() + bar.get_width() / 2., h),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points', fontsize=10)
# Cleaner look without the top/right spines
for spine in ('top', 'right'):
    ax.spines[spine].set_visible(False)
plt.title('Distribution of Top Object Type Percentage')
plt.xlabel('Top Object Type Percentage')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
For the same task we are creating a stacked bar chart, to look at it from a different perspective
# Bin the dominant-type share into 5%-wide classes and show how many objects
# fall into each bin as a single stacked bar
bins = np.arange(0, 105, 5)  # bin edges 0, 5, ..., 100
labels = [f'{i}-{i+5}%' for i in range(0, 100, 5)]
# Bin the data (left-closed intervals, 100% itself falls outside -> NaN)
lines_gdf_stat['bins'] = pd.cut(lines_gdf_stat['top_object_type_percentage'], bins=bins, labels=labels, right=False)
bin_counts = lines_gdf_stat['bins'].value_counts().sort_index()
# Largest bins first so the legend lists them in order of importance
sorted_bin_counts = bin_counts.sort_values(ascending=False)
total = bin_counts.sum()
bin_labels = sorted_bin_counts.index
counts = sorted_bin_counts.values
percentages = (counts / total) * 100
# Inverted gradient palette: darkest colour for the biggest bin
cmap = sns.color_palette("PuBuGn", len(counts))[::-1]
plt.figure(figsize=(10, 6))
# Outline of the full bar
plt.bar(['All Bins'], [sum(counts)], color='white', edgecolor='black', linewidth=1)
# Stack the bins on top of each other with gradient colours
bottom = 0
for count, percentage, label, color in zip(counts, percentages, bin_labels, cmap):
    plt.bar(['All Bins'], [count], bottom=bottom, label=f'{label} ({percentage:.2f}%)', color=color)
    bottom += count
# Annotate each section with white text
bottom = 0
for count, percentage in zip(counts, percentages):
    plt.text(0, bottom + count / 2, f'{count}\n({percentage:.2f}%)', ha='center', va='center', fontsize=10, color='white')
    bottom += count
# BUG FIX: the original toggled spines on a stale `ax` belonging to a previous
# plot; operate on the current axes instead
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Set title and labels
plt.title('Stacked Bar Chart of Objects in Each 5% Bin')
plt.xlabel('Top Object Type Percentage Bins')
plt.ylabel('Number of Objects')
plt.legend(title='Bins', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
More than 36% of the objects that change their type actually don’t change it much — 90%+ of their points are classified as a single type. Another 12% have more than 80% of their points with the same type.
This helps us identify data that changes its type a lot and will be hard to analyse. We won’t filter the initial data based on this variable, but we will consider it as one of the parameters for the data quality assessment
Let’s explore how the object types were changing (order of transition between types):
# Frequency of each type-transition sequence (order of first appearance kept)
counts = lines_type_changes['unique_object_types'].value_counts().sort_values(ascending=False)
# One distinct colour per bar
palette = sns.color_palette("husl", len(counts))
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=counts.index, y=counts.values, palette=palette)
total = len(lines_type_changes)
# Label every bar with its absolute count and share
for bar, count in zip(ax.patches, counts):
    h = bar.get_height()
    share = f'{h / total * 100:.2f}%'
    ax.annotate(f'{int(h)}\n({share})',
                (bar.get_x() + bar.get_width() / 2., h),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points', fontsize=10)
# Cleaner look without the top/right spines
for spine in ('top', 'right'):
    ax.spines[spine].set_visible(False)
plt.title('Distribution of Object Types')
plt.xlabel('Object Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
Most of the objects changed their type from pedestrian to car or from pedestrian to cyclist (around 28% for both)
We tried to explore whether there would be some interesting pattern in heading change or velocity change for each of these groups, but based on the average parameters we couldn’t find anything
# Average heading / velocity change per type-transition pattern; nothing
# anomalous stood out in either statistic
by_transition = lines_type_changes.groupby('unique_object_types')
avg_heading_change = by_transition['avg_heading_change'].mean()
avg_v_change = by_transition['avg_v_change'].mean()
print(avg_heading_change)
print(avg_v_change)
unique_object_types
CAR, CYCLIST -0.121379
CAR, CYCLIST, PEDESTRIAN -0.289290
CAR, PEDESTRIAN 1.940362
CAR, PEDESTRIAN, CYCLIST -0.374118
CYCLIST, CAR 0.981525
CYCLIST, CAR, PEDESTRIAN 0.251706
CYCLIST, PEDESTRIAN -0.368309
CYCLIST, PEDESTRIAN, CAR -0.630435
PEDESTRIAN, CAR -0.088528
PEDESTRIAN, CAR, CYCLIST -0.495457
PEDESTRIAN, CYCLIST 0.125764
PEDESTRIAN, CYCLIST, CAR -0.379689
Name: avg_heading_change, dtype: float64
unique_object_types
CAR, CYCLIST 0.010916
CAR, CYCLIST, PEDESTRIAN -0.010641
CAR, PEDESTRIAN -0.018787
CAR, PEDESTRIAN, CYCLIST 0.003939
CYCLIST, CAR -0.010123
CYCLIST, CAR, PEDESTRIAN -0.003026
CYCLIST, PEDESTRIAN 0.002024
CYCLIST, PEDESTRIAN, CAR -0.033420
PEDESTRIAN, CAR 0.040450
PEDESTRIAN, CAR, CYCLIST 0.024540
PEDESTRIAN, CYCLIST 0.006920
PEDESTRIAN, CYCLIST, CAR 0.005258
Name: avg_v_change, dtype: float64
Additional Functions#
We have created some additional functions that will help us explore the data for different object types:
The function to create plots for different fields in a dataframe:
def plot_stats(df, x_field, y_field=None, z_field=None):
    """Show a boxplot plus histograms for up to three numeric columns of *df*,
    then print their descriptive statistics.

    The boxplot covers x_field, plus y_field when given, plus z_field only
    when y_field is also given (mirroring the original branch structure);
    the second histogram shows y_field if given, otherwise z_field.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Boxplot over the selected columns
    box_cols = [x_field]
    if y_field:
        box_cols.append(y_field)
        if z_field:
            box_cols.append(z_field)
    sns.boxplot(data=df[box_cols], ax=axes[0], palette='Set3')
    axes[0].set_title('Boxplot')
    axes[0].set_xlabel('Variables')
    axes[0].set_ylabel('Values')

    # Histogram for the primary field
    sns.histplot(data=df, x=x_field, bins=10, kde=True, color='skyblue', ax=axes[1])
    axes[1].set_title(f'Histogram of {x_field}')
    axes[1].set_xlabel(f'{x_field}')
    axes[1].set_ylabel('Frequency')

    # Second histogram: y_field takes precedence over z_field; hide the slot
    # entirely when neither is provided
    second = y_field or z_field
    if second:
        sns.histplot(data=df, x=second, bins=10, kde=True, color='lightgreen', ax=axes[2])
        axes[2].set_title(f'Histogram of {second}')
        axes[2].set_xlabel(f'{second}')
        axes[2].set_ylabel('Frequency')
    else:
        axes[2].axis('off')

    plt.tight_layout()
    plt.show()

    # Descriptive statistics for every provided field
    for field in (x_field, y_field, z_field):
        if field:
            print(f"\nStatistics for {field}:")
            print(df[field].describe())
The function to create a set of plots for specific fields in a dataframe:
def analyze_dataset(dataset):
    """Run the standard exploratory plots and statistics for a dataset.

    Covers, in order: 1) object dimension characteristics (width, length,
    height), 2) route length vs straight-line first-to-last distance,
    3) the ratio between those two distances, 4) total time, 5) average
    velocity.
    """
    field_groups = [
        ('avg_width', 'avg_length', 'avg_height'),  # object dimensions
        ('route_length', 'distance_first_last'),    # travelled vs direct distance
        ('direct_real_d_ratio',),                   # ratio of the two distances
        ('total_time',),                            # total observation time
        ('avg_v',),                                 # average velocity
    ]
    for group in field_groups:
        plot_stats(dataset, *group)
1. Cars#
# Subset of the trajectory GeoDataFrame containing only objects classified as cars
lines_gdf_cars = lines_gdf_stat[lines_gdf_stat['object_type'] =='CAR']
lines_gdf_cars.head()
| object_id | geometry | total_time | avg_heading | avg_v | avg_width | avg_length | avg_height | object_type | time_class | ... | CYCLIST | PEDESTRIAN | top_object_type_percentage | route_length | total_time_s | direct_real_d_ratio | unique_object_types_sorted | typeChanges | typeChangesRate | bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | LINESTRING (13.06440 47.81014, 13.06441 47.810... | 286302 | 133.7800 | 0.03 | 1.869 | 4.387 | 1.151 | CAR | 0 | ... | 0 | 0 | 100.0 | 102.985753 | 286.302 | 623.585910 | CAR | 0 | 0.0 | NaN |
| 2 | 152997182 | LINESTRING (13.06413 47.81005, 13.06413 47.810... | 270701 | 139.9335 | 0.02 | 1.912 | 4.523 | 1.690 | CAR | 0 | ... | 0 | 0 | 100.0 | 60.296137 | 270.701 | 698.199732 | CAR | 0 | 0.0 | NaN |
| 3 | 152997183 | LINESTRING (13.06339 47.80977, 13.06339 47.809... | 101799 | 105.4660 | 0.01 | 1.872 | 4.867 | 1.566 | CAR | 0 | ... | 0 | 0 | 100.0 | 144.550702 | 101.799 | 1.208331 | CAR | 0 | 0.0 | NaN |
| 4 | 152997184 | LINESTRING (13.06339 47.80978, 13.06340 47.809... | 100897 | 97.4585 | 0.01 | 1.974 | 5.000 | 1.615 | CAR | 0 | ... | 0 | 0 | 100.0 | 144.484505 | 100.897 | 1.299410 | CAR | 0 | 0.0 | NaN |
| 7 | 152997753 | LINESTRING (13.06394 47.80935, 13.06394 47.809... | 109398 | 34.2955 | 0.04 | 2.227 | 5.777 | 2.422 | CAR | 0 | ... | 0 | 0 | 100.0 | 153.183233 | 109.398 | 2.013326 | CAR | 0 | 0.0 | NaN |
5 rows × 25 columns
CARS DIMENSIONS (width, height, length)#
# Run the standard exploratory plots/statistics for the car subset
analyze_dataset(lines_gdf_cars)
Statistics for avg_width:
count 4178.000000
mean 1.899717
std 0.577520
min 0.054000
25% 1.854250
50% 1.923500
75% 1.989000
max 18.495000
Name: avg_width, dtype: float64
Statistics for avg_length:
count 4178.000000
mean 4.008679
std 2.248963
min 0.009000
25% 4.023000
50% 4.395000
75% 4.664000
max 18.706000
Name: avg_length, dtype: float64
Statistics for avg_height:
count 4178.000000
mean 1.136760
std 0.694165
min 0.004000
25% 0.636625
50% 1.145000
75% 1.425000
max 3.071000
Name: avg_height, dtype: float64
Statistics for route_length:
count 4178.000000
mean 75.322052
std 61.633961
min 0.000000
25% 21.538840
50% 77.358236
75% 112.704000
max 784.263218
Name: route_length, dtype: float64
Statistics for distance_first_last:
count 4178.000000
mean 56.380821
std 43.919954
min 0.000000
25% 9.259288
50% 58.883808
75% 93.647267
max 149.418347
Name: distance_first_last, dtype: float64
Statistics for direct_real_d_ratio:
count 4176.000000
mean inf
std NaN
min 1.000000
25% 1.055816
50% 1.156346
75% 1.455117
max inf
Name: direct_real_d_ratio, dtype: float64
Statistics for total_time:
count 4178.000000
mean 31888.159167
std 44217.103261
min 91.000000
25% 5098.500000
50% 14200.000000
75% 51098.000000
max 975498.000000
Name: total_time, dtype: float64
Statistics for avg_v:
count 4178.000000
mean 3.500172
std 3.425574
min 0.000000
25% 0.120000
50% 2.662500
75% 6.160000
max 14.745000
Name: avg_v, dtype: float64
Based on these characteristics we can distinguish statistical outliers and use them as data quality metrics
def detect_outliers(df, columns):
    """Flag IQR-based outliers for each column in *columns*.

    Adds one "<col>_outlier" 0/1 field per column (1 = value outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR]) and a 'total_outliers' field holding
    the per-row count of flagged columns. Works on a copy, so *df*
    itself is left unchanged.
    """
    flagged = df.copy()
    flag_fields = []
    for column in columns:
        # Interquartile range of the column
        q1 = flagged[column].quantile(0.25)
        q3 = flagged[column].quantile(0.75)
        spread = q3 - q1
        low, high = q1 - 1.5 * spread, q3 + 1.5 * spread
        # 0/1 indicator column marking values outside the IQR fences
        field_name = f"{column}_outlier"
        outside = (flagged[column] < low) | (flagged[column] > high)
        flagged[field_name] = outside.astype(int)
        flag_fields.append(field_name)
    # Per-row total of flagged metrics
    flagged['total_outliers'] = flagged[flag_fields].sum(axis=1)
    return flagged
# Flag IQR outliers in the key metrics for the car trajectories
columns_to_check = ['total_time', 'avg_v', 'avg_width', 'avg_height', 'avg_length', 'route_length', 'distance_first_last']
cars_outliers = detect_outliers(lines_gdf_cars, columns_to_check)
cars_outliers.head()
| object_id | geometry | total_time | avg_heading | avg_v | avg_width | avg_length | avg_height | object_type | time_class | ... | direction_class | avg_heading_rad | total_time_outlier | avg_v_outlier | avg_width_outlier | avg_height_outlier | avg_length_outlier | route_length_outlier | distance_first_last_outlier | total_outliers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | LINESTRING (13.06440 47.81014, 13.06441 47.810... | 286302 | 133.7800 | 0.03 | 1.869 | 4.387 | 1.151 | CAR | 0 | ... | 1 | 2.334901 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 152997182 | LINESTRING (13.06413 47.81005, 13.06413 47.810... | 270701 | 139.9335 | 0.02 | 1.912 | 4.523 | 1.690 | CAR | 0 | ... | 1 | 2.442300 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 152997183 | LINESTRING (13.06339 47.80977, 13.06339 47.809... | 101799 | 105.4660 | 0.01 | 1.872 | 4.867 | 1.566 | CAR | 0 | ... | 1 | 1.840729 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 152997184 | LINESTRING (13.06339 47.80978, 13.06340 47.809... | 100897 | 97.4585 | 0.01 | 1.974 | 5.000 | 1.615 | CAR | 0 | ... | 1 | 1.700972 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 152997753 | LINESTRING (13.06394 47.80935, 13.06394 47.809... | 109398 | 34.2955 | 0.04 | 2.227 | 5.777 | 2.422 | CAR | 0 | ... | 0 | 0.598569 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 2 |
5 rows × 37 columns
# Stacked bar chart: per time_class, the percentage of cars with 0..N outlier flags.
# Calculate the counts and percentages of each combination of time_class and total_outliers
counts = cars_outliers.groupby(['time_class', 'total_outliers']).size().reset_index(name='count')
total_counts = counts.groupby('time_class')['count'].transform('sum')
counts['percentage'] = (counts['count'] / total_counts) * 100
# Pivot the data to prepare for plotting
pivot_counts = counts.pivot(index='time_class', columns='total_outliers', values='percentage').fillna(0)
# Plotting
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'lightcoral', 'gray'] # Define additional colors
ax = pivot_counts.plot(kind='bar', stacked=True, color=colors, figsize=(8, 6))
# Add labels and title
ax.set_title('Object Counts by time_class and total_outliers', fontsize=14)
ax.set_xlabel('time_class', fontsize=12)
ax.set_ylabel('Percentage of Objects (%)', fontsize=12)
ax.legend(title='total_outliers', fontsize=10)
# Show percentages inside each bar segment (text centered in each stacked slice)
for i in range(pivot_counts.shape[0]):
    total = pivot_counts.iloc[i].sum()
    cum_sum = 0
    for j in range(pivot_counts.shape[1]):
        value = pivot_counts.iloc[i, j]
        ax.text(i, cum_sum + value / 2, f'{value:.1f}%', ha='center', va='center', color='gray', fontsize=7)
        cum_sum += value
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
2. Pedestrians#
# Subset of the trajectory GeoDataFrame containing only pedestrians
lines_gdf_pedestrians = lines_gdf_stat[lines_gdf_stat['object_type'] =='PEDESTRIAN']
analyze_dataset(lines_gdf_pedestrians)
Statistics for avg_width:
count 1769.000000
mean 0.517953
std 0.210210
min 0.064000
25% 0.368000
50% 0.503500
75% 0.624000
max 1.491000
Name: avg_width, dtype: float64
Statistics for avg_length:
count 1769.000000
mean 0.499413
std 0.242799
min 0.092000
25% 0.315000
50% 0.470000
75% 0.595000
max 2.364000
Name: avg_length, dtype: float64
Statistics for avg_height:
count 1769.000000
mean 1.109840
std 0.360861
min 0.048500
25% 0.849000
50% 1.123500
75% 1.372500
max 2.450000
Name: avg_height, dtype: float64
Statistics for route_length:
count 1769.000000
mean 24.235552
std 44.936549
min 0.000000
25% 1.203799
50% 6.902049
75% 28.358431
max 895.087643
Name: route_length, dtype: float64
Statistics for distance_first_last:
count 1769.000000
mean 14.300289
std 25.493666
min 0.000000
25% 0.245034
50% 2.382743
75% 14.300744
max 133.101487
Name: distance_first_last, dtype: float64
Statistics for direct_real_d_ratio:
count 1760.000000
mean inf
std NaN
min 1.000000
25% 1.092678
50% 1.405990
75% 4.543949
max inf
Name: direct_real_d_ratio, dtype: float64
Statistics for total_time:
count 1769.000000
mean 29624.931034
std 56806.996871
min 93.000000
25% 2400.000000
50% 8596.000000
75% 33999.000000
max 769198.000000
Name: total_time, dtype: float64
Statistics for avg_v:
count 1769.000000
mean 0.726439
std 0.721659
min 0.000000
25% 0.070000
50% 0.600000
75% 1.300000
max 6.880000
Name: avg_v, dtype: float64
# Flag IQR outliers in the key metrics for the pedestrian trajectories
columns_to_check = ['total_time', 'avg_v', 'avg_width', 'avg_height', 'avg_length', 'route_length', 'distance_first_last']
pedestrians_outliers = detect_outliers(lines_gdf_pedestrians, columns_to_check)
pedestrians_outliers.head()
| object_id | geometry | total_time | avg_heading | avg_v | avg_width | avg_length | avg_height | object_type | time_class | ... | direction_class | avg_heading_rad | total_time_outlier | avg_v_outlier | avg_width_outlier | avg_height_outlier | avg_length_outlier | route_length_outlier | distance_first_last_outlier | total_outliers | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 152997181 | LINESTRING (13.06399 47.81006, 13.06399 47.810... | 138700 | 148.052 | 0.770 | 0.559 | 0.5585 | 1.3320 | PEDESTRIAN | 0 | ... | 1 | 2.583995 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 3 |
| 5 | 152997343 | LINESTRING (13.06429 47.80957, 13.06429 47.809... | 108398 | 228.052 | 0.100 | 0.659 | 0.6375 | 1.0935 | PEDESTRIAN | 0 | ... | 2 | 3.980258 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 6 | 152997466 | LINESTRING (13.06442 47.80944, 13.06441 47.809... | 115007 | 233.052 | 0.070 | 0.459 | 0.5000 | 1.4510 | PEDESTRIAN | 0 | ... | 3 | 4.067525 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 3 |
| 8 | 152997836 | LINESTRING (13.06453 47.80964, 13.06453 47.809... | 109009 | 134.466 | 0.020 | 0.574 | 0.6820 | 1.2320 | PEDESTRIAN | 0 | ... | 1 | 2.346874 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 3 |
| 9 | 152997959 | LINESTRING (13.06450 47.80986, 13.06451 47.809... | 116001 | 223.052 | 0.585 | 0.448 | 0.3850 | 1.1940 | PEDESTRIAN | 0 | ... | 2 | 3.892992 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 2 |
5 rows × 35 columns
# Stacked bar chart: per time_class, the percentage of pedestrians with 0..N outlier flags.
# Calculate the counts and percentages of each combination of time_class and total_outliers
counts = pedestrians_outliers.groupby(['time_class', 'total_outliers']).size().reset_index(name='count')
total_counts = counts.groupby('time_class')['count'].transform('sum')
counts['percentage'] = (counts['count'] / total_counts) * 100
# Pivot the data to prepare for plotting
pivot_counts = counts.pivot(index='time_class', columns='total_outliers', values='percentage').fillna(0)
# Plotting
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'lightcoral'] # Define additional colors
ax = pivot_counts.plot(kind='bar', stacked=True, color=colors, figsize=(8, 6))
# Add labels and title
ax.set_title('Object Counts by time_class and total_outliers', fontsize=14)
ax.set_xlabel('time_class', fontsize=12)
ax.set_ylabel('Percentage of Objects (%)', fontsize=12)
ax.legend(title='total_outliers', fontsize=10)
# Show percentages inside each bar segment (text centered in each stacked slice)
for i in range(pivot_counts.shape[0]):
    total = pivot_counts.iloc[i].sum()
    cum_sum = 0
    for j in range(pivot_counts.shape[1]):
        value = pivot_counts.iloc[i, j]
        ax.text(i, cum_sum + value / 2, f'{value:.1f}%', ha='center', va='center', color='gray', fontsize=7)
        cum_sum += value
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
3. Two-Wheelers#
# Subset of the trajectory GeoDataFrame containing only cyclists / two-wheelers
lines_gdf_two_wheelers = lines_gdf_stat[lines_gdf_stat['object_type'] =='CYCLIST']
analyze_dataset(lines_gdf_two_wheelers)
Statistics for avg_width:
count 250.000000
mean 0.632556
std 0.207776
min 0.112500
25% 0.541625
50% 0.623000
75% 0.697250
max 1.714000
Name: avg_width, dtype: float64
Statistics for avg_length:
count 250.000000
mean 1.252888
std 0.498308
min 0.076000
25% 0.898750
50% 1.250000
75% 1.629125
max 3.735000
Name: avg_length, dtype: float64
Statistics for avg_height:
count 250.000000
mean 1.285610
std 0.310616
min 0.019000
25% 1.155250
50% 1.344250
75% 1.445750
max 2.803000
Name: avg_height, dtype: float64
Statistics for route_length:
count 250.000000
mean 76.275553
std 58.753900
min 0.022234
25% 22.881110
50% 84.569250
75% 113.700210
max 559.920694
Name: route_length, dtype: float64
Statistics for distance_first_last:
count 250.000000
mean 63.831372
std 43.892981
min 0.000000
25% 18.486127
50% 75.049915
75% 99.808464
max 135.982695
Name: distance_first_last, dtype: float64
Statistics for direct_real_d_ratio:
count 250.000000
mean inf
std NaN
min 1.000000
25% 1.026796
50% 1.087646
75% 1.246673
max inf
Name: direct_real_d_ratio, dtype: float64
Statistics for total_time:
count 250.000000
mean 31482.288000
std 33844.385272
min 94.000000
25% 8623.250000
50% 19600.000000
75% 41577.000000
max 216099.000000
Name: total_time, dtype: float64
Statistics for avg_v:
count 250.000000
mean 3.245340
std 2.823628
min 0.000000
25% 1.022500
50% 2.695000
75% 5.335000
max 13.430000
Name: avg_v, dtype: float64
# Flag IQR outliers in the key metrics for the two-wheeler trajectories
columns_to_check = ['total_time', 'avg_v', 'avg_width', 'avg_height', 'avg_length', 'route_length', 'distance_first_last']
two_wheelers_outliers = detect_outliers(lines_gdf_two_wheelers, columns_to_check)
# Stacked bar chart: per time_class, the percentage of two-wheelers with 0..N outlier flags.
# Calculate the counts and percentages of each combination of time_class and total_outliers
counts = two_wheelers_outliers.groupby(['time_class', 'total_outliers']).size().reset_index(name='count')
total_counts = counts.groupby('time_class')['count'].transform('sum')
counts['percentage'] = (counts['count'] / total_counts) * 100
# Pivot the data to prepare for plotting
pivot_counts = counts.pivot(index='time_class', columns='total_outliers', values='percentage').fillna(0)
# Plotting
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'lightcoral'] # Define additional colors
ax = pivot_counts.plot(kind='bar', stacked=True, color=colors, figsize=(8, 6))
# Add labels and title
ax.set_title('Object Counts by time_class and total_outliers', fontsize=14)
ax.set_xlabel('time_class', fontsize=12)
ax.set_ylabel('Percentage of Objects (%)', fontsize=12)
ax.legend(title='total_outliers', fontsize=10)
# Show percentages inside each bar segment (text centered in each stacked slice)
for i in range(pivot_counts.shape[0]):
    total = pivot_counts.iloc[i].sum()
    cum_sum = 0
    for j in range(pivot_counts.shape[1]):
        value = pivot_counts.iloc[i, j]
        ax.text(i, cum_sum + value / 2, f'{value:.1f}%', ha='center', va='center', color='gray', fontsize=7)
        cum_sum += value
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
Quality Assessment#
Defining directions and calculating number of cars in each direction. Option 1#
Creating Starting points#
# First recorded point of every tracked object
# (assumes points_gdf rows are time-ordered within each object_id -- TODO confirm)
starting_points = points_gdf.groupby('object_id').first().reset_index().set_crs(epsg=4326)
starting_points.head()
| object_id | timestamp | heading | height | width | length | v | tracking_status | object_type | lon | lat | time_class | geometry | distance_first_last | heading_change | avg_heading_change | v_change | avg_v_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | 1712062811083 | 132.072 | 0.735 | 1.942 | 4.387 | 0.03 | TRACKING | CAR | 13.064405 | 47.810136 | 0 | POINT (13.06440 47.81014) | 0.165151 | 0.0 | 0.065886 | 0.0 | 0.000004 |
| 1 | 152997181 | 1712062820184 | 198.052 | 1.458 | 0.641 | 0.366 | 1.10 | TRACKING | PEDESTRIAN | 13.063991 | 47.810058 | 0 | POINT (13.06399 47.81006) | 65.637374 | 0.0 | -0.048435 | 0.0 | -0.000261 |
| 2 | 152997182 | 1712062820083 | 319.543 | 1.690 | 1.916 | 4.555 | 0.02 | TRACKING | CAR | 13.064130 | 47.810046 | 0 | POINT (13.06413 47.81005) | 0.086359 | 0.0 | -0.068684 | 0.0 | 0.000004 |
| 3 | 152997183 | 1712062822387 | 103.052 | 0.539 | 1.923 | 4.550 | 1.92 | TRACKING | CAR | 13.063387 | 47.809774 | 0 | POINT (13.06339 47.80977) | 119.628432 | 0.0 | -0.081633 | 0.0 | 0.009214 |
| 4 | 152997184 | 1712062824786 | 85.395 | 1.207 | 1.981 | 4.367 | 2.18 | TRACKING | CAR | 13.063389 | 47.809776 | 0 | POINT (13.06339 47.80978) | 111.192417 | 0.0 | -0.053851 | 0.0 | 0.006307 |
# Interactive map of starting points over satellite imagery
starting_points.explore(tiles='Esri.WorldImagery')
# Last recorded point of every tracked object
ending_points = points_gdf.groupby('object_id').last().reset_index().set_crs(epsg=4326)
ending_points.head()
| object_id | timestamp | heading | height | width | length | v | tracking_status | object_type | lon | lat | time_class | geometry | distance_first_last | heading_change | avg_heading_change | v_change | avg_v_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 152997118 | 1712063097385 | 313.127 | 0.348 | 1.955 | 4.500 | 0.04 | TRACKING | CAR | 13.064403 | 47.810137 | 0 | POINT (13.06440 47.81014) | 0.165151 | -0.696 | 0.065886 | 0.03 | 0.000004 |
| 1 | 152997181 | 1712062958884 | 133.052 | 0.528 | 0.645 | 0.209 | 0.75 | TRACKING | PEDESTRIAN | 13.064302 | 47.809506 | 0 | POINT (13.06430 47.80951) | 65.637374 | 75.000 | -0.048435 | 0.05 | -0.000261 |
| 2 | 152997182 | 1712063090784 | 139.865 | 0.119 | 1.883 | 4.461 | 0.03 | TRACKING | CAR | 13.064130 | 47.810045 | 0 | POINT (13.06413 47.81005) | 0.086359 | 0.126 | -0.068684 | 0.00 | 0.000004 |
| 3 | 152997183 | 1712062924186 | 23.052 | 0.017 | 1.880 | 4.287 | 10.95 | TRACKING | CAR | 13.064855 | 47.810200 | 0 | POINT (13.06485 47.81020) | 119.628432 | -15.000 | -0.081633 | -0.40 | 0.009214 |
| 4 | 152997184 | 1712062925683 | 33.052 | 0.596 | 1.959 | 4.330 | 8.31 | TRACKING | CAR | 13.064773 | 47.810138 | 0 | POINT (13.06477 47.81014) | 111.192417 | -9.464 | -0.053851 | -2.54 | 0.006307 |
# Interactive map of ending points over satellite imagery
ending_points.explore(tiles='Esri.WorldImagery')
CARS Directions#
def create_regular_grid(gdf, square_size):
    """Build a regular square grid (fishnet) covering the bounds of *gdf*.

    The grid is laid out in the dataset's estimated UTM zone so that
    *square_size* is in metres, then reprojected back to WGS84.
    Returns a GeoDataFrame with 'geometry' and 'grid_id' columns.
    """
    # Work in a metric CRS so square_size can be expressed in metres
    metric_crs = gdf.estimate_utm_crs()
    projected = gdf.to_crs(metric_crs)
    min_x, min_y, max_x, max_y = projected.total_bounds
    # Sweep the bounding box row by row, bottom-left to top-right
    cells = []
    row_y = min_y
    while row_y <= max_y:
        col_x = min_x
        while col_x <= max_x:
            cells.append(Polygon([
                (col_x, row_y),
                (col_x, row_y + square_size),
                (col_x + square_size, row_y + square_size),
                (col_x + square_size, row_y),
                (col_x, row_y),
            ]))
            col_x += square_size
        row_y += square_size
    fishnet = gpd.GeoDataFrame(geometry=cells, crs=metric_crs)
    fishnet['grid_id'] = range(len(cells))
    # Back to geographic coordinates for mapping / joins with WGS84 layers
    return fishnet.to_crs(epsg=4326)
# Start/end points of car trajectories, stacked into one point layer
startingPointsCars = starting_points[starting_points['object_type']=="CAR"]
endingPointsCars = ending_points[ending_points['object_type']=="CAR"]
startEndPointsCars = pd.concat([startingPointsCars, endingPointsCars], ignore_index=True)
# 7.5 m fishnet covering the car start/end points
grid = create_regular_grid(startEndPointsCars, 7.5)
grid.explore(tiles='Esri.WorldImagery')
# Count the start/end points falling inside each grid cell
points_grid = gpd.sjoin(startEndPointsCars, grid, predicate='within')
points_count = points_grid.groupby('grid_id').size().reset_index(name='points_count')
grid_with_count = grid.merge(points_count, on='grid_id', how='left')
grid_with_count.head()
| geometry | grid_id | points_count | |
|---|---|---|---|
| 0 | POLYGON ((13.06309 47.80930, 13.06309 47.80937... | 0 | NaN |
| 1 | POLYGON ((13.06319 47.80930, 13.06319 47.80937... | 1 | NaN |
| 2 | POLYGON ((13.06329 47.80930, 13.06329 47.80937... | 2 | NaN |
| 3 | POLYGON ((13.06339 47.80930, 13.06339 47.80937... | 3 | NaN |
| 4 | POLYGON ((13.06349 47.80931, 13.06349 47.80937... | 4 | NaN |
# Keep only densely used grid cells (> 25 start/end points) as the main clusters
grid_main_clusters = grid_with_count[grid_with_count['points_count']>25]
grid_with_count.explore(column='points_count', tiles='Esri.WorldImagery')
Create Clusters#
Function to find clusters of polygons based on spatial relationships (touches or intersects:
def find_polygon_clusters(gdf):
    """Group polygons into clusters of spatially connected geometries.

    Two polygons belong to the same cluster when they touch or intersect,
    directly or through a chain of neighbors (connected components).
    Returns a copy of *gdf* (invalid geometries dropped) with an added
    'cluster' column holding labels 'Cluster1', 'Cluster2', ...

    Fixes over the previous version: clusters are now merged transitively
    via union-find (previously a polygon could get a fresh cluster name
    even though its neighbors were already assigned to another cluster);
    the unused MultiPolygon accumulation was removed; and the filtered
    frame is copied explicitly to avoid a SettingWithCopyWarning.
    """
    # Ensure geometries are valid; copy so the new column is set on our own frame
    gdf = gdf[gdf.is_valid].copy()

    # Union-find over the GeoDataFrame index
    parent = {idx: idx for idx in gdf.index}

    def _find(i):
        # Path-halving find
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def _union(a, b):
        root_a, root_b = _find(a), _find(b)
        if root_a != root_b:
            parent[root_b] = root_a

    # Link every polygon to all polygons it touches or intersects
    for idx, polygon in gdf.iterrows():
        neighbors = gdf[gdf.geometry.touches(polygon['geometry']) | gdf.geometry.intersects(polygon['geometry'])]
        for neighbor_idx in neighbors.index:
            _union(idx, neighbor_idx)

    # Name components in first-seen index order: Cluster1, Cluster2, ...
    root_names = {}
    cluster_names = {}
    cluster_counter = 1
    for idx in gdf.index:
        root = _find(idx)
        if root not in root_names:
            root_names[root] = f'Cluster{cluster_counter}'
            cluster_counter += 1
        cluster_names[idx] = root_names[root]

    # Assign cluster names to the GeoDataFrame
    gdf['cluster'] = gdf.index.map(cluster_names)
    return gdf
# Cluster the dense grid cells into named intersection arms
cars_clusters = find_polygon_clusters(grid_main_clusters)
cars_clusters .explore(column='cluster', tiles='Esri.WorldImagery')
# Attach the cluster each car trajectory starts/ends in
startingPointsCars_CLUSTER = gpd.sjoin(startingPointsCars.to_crs(cars_clusters .crs), cars_clusters , predicate='within')
endingPointsCars_CLUSTER = gpd.sjoin(endingPointsCars.to_crs(cars_clusters .crs), cars_clusters , predicate='within')
startingPointsCars_CLUSTER['start_cluster'] = startingPointsCars_CLUSTER['cluster']
endingPointsCars_CLUSTER['end_cluster'] = endingPointsCars_CLUSTER['cluster']
startingPointsCars_CLUSTER.explore(column="cluster",tiles='Esri.WorldImagery')
endingPointsCars_CLUSTER.explore(column="cluster", tiles='Esri.WorldImagery')
Getting Directions
# Merge start/end cluster labels onto the car trajectories and count each
# (start_cluster, end_cluster) direction combination
lines_gdf_cars = lines_gdf_cars.merge(startingPointsCars_CLUSTER[['object_id','start_cluster']], on="object_id", how="left")
lines_gdf_cars = lines_gdf_cars.merge(endingPointsCars_CLUSTER[['object_id','end_cluster']], on="object_id", how="left")
lines_gdf_cars.groupby(['start_cluster', 'end_cluster']).size()
start_cluster end_cluster
Cluster1 Cluster1 36
Cluster2 2
Cluster3 14
Cluster4 140
Cluster2 Cluster1 118
Cluster2 66
Cluster3 383
Cluster4 114
Cluster3 Cluster1 31
Cluster2 490
Cluster3 298
Cluster4 260
Cluster4 Cluster1 118
Cluster2 249
Cluster3 248
Cluster4 922
Cluster5 Cluster5 18
dtype: int64
Defining directions and calculating number of cars in each direction. Option 2#
The previous approach unfortunately can't be applied to pedestrians and two-wheelers, as their routes are less clearly distinguishable and not straight. So, for pedestrians and two-wheelers we will define their directions based on the intersection with crosslines created by us in QGIS
Defining directions and calculating number of cars in each direction. Option 3#
This approach is very easy to implement but it is less precise in comparison to the previous two. We have started with it and want to show it anyway as it was a part of our work and it still shows some overall information about directions
Calculating amount of objects in different directions for cars#
# Jenks natural breaks (5 classes) over the cars' average heading
cars_heading_breaks = jenkspy.jenks_breaks(lines_gdf_cars['avg_heading'], 5)
# Create a direction-class column based on the breaks
lines_gdf_cars['direction_class'] = pd.cut(lines_gdf_cars['avg_heading'], bins=cars_heading_breaks, labels=False, include_lowest=True)
# Count the number of objects in each direction class
direction_class_counts = lines_gdf_cars['direction_class'].value_counts().sort_index()
print("Breaks:", cars_heading_breaks)
print("Direction Class Counts:\n", direction_class_counts)
Breaks: [8.052, 70.552, 158.05200000000002, 236.827, 276.276, 353.052]
Direction Class Counts:
0 642
1 1072
2 1361
3 641
4 462
Name: direction_class, dtype: int64
# Creating Histogram of cars' average heading, colored by direction class
sns.histplot(data=lines_gdf_cars, x='avg_heading', hue='direction_class', bins=20, alpha=0.8, palette='BuPu', edgecolor='lightgray')
# Removing graph border
sns.despine()
# Adding title and axis names
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram, bins colored by class')
# Showing graph
plt.show()
# Convert angles from degrees to radians for the polar plot
lines_gdf_cars['avg_heading_rad'] = np.deg2rad(lines_gdf_cars['avg_heading'])
# Create the polar plot
plt.figure(figsize=(10, 6))
ax = plt.subplot(111, projection='polar')
# Configure the polar plot: 0 degrees at the top and clockwise direction
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
# Define the number of bins (8 compass sectors of 45 degrees each)
num_bins = 8
bins = np.linspace(0, 2 * np.pi, num_bins + 1)
# Count the number of objects in each bin
hist, edges = np.histogram(lines_gdf_cars['avg_heading_rad'], bins=bins)
# Plot the histogram on the polar plot
bars = ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge', edgecolor='black')
# Set angle labels in degrees
ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
ax.set_xticklabels(['0°', '45°', '90°', '135°', '180°', '225°', '270°', '315°'])
# Add title
plt.title('Distribution of Objects by Avg Heading')
# Display the plot
plt.show()
Calculating amount of objects in different directions for pedestrians#
# Jenks natural breaks (5 classes) over the pedestrians' average heading
pedestrians_heading_breaks = jenkspy.jenks_breaks(lines_gdf_pedestrians['avg_heading'], 5)
# Create a direction-class column based on the breaks
lines_gdf_pedestrians['direction_class'] = pd.cut(lines_gdf_pedestrians['avg_heading'], bins=pedestrians_heading_breaks, labels=False, include_lowest=True)
# Count the number of objects in each direction class
direction_class_counts = lines_gdf_pedestrians['direction_class'].value_counts().sort_index()
print("Breaks:", pedestrians_heading_breaks)
print("Direction Class Counts:\n", direction_class_counts)
Breaks: [3.052, 89.59200000000001, 160.552, 228.439, 285.552, 358.052]
Direction Class Counts:
0 388
1 371
2 411
3 358
4 241
Name: direction_class, dtype: int64
/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/geopandas/geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
# Creating Histogram of pedestrians' average heading, colored by direction class
sns.histplot(data=lines_gdf_pedestrians, x='avg_heading', hue='direction_class', bins=20, alpha=0.8, palette='BuPu', edgecolor='lightgray')
# Removing graph border
sns.despine()
# Adding title and axis names
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram, bins colored by class')
# Showing graph
plt.show()
# Convert angles from degrees to radians for the polar plot
lines_gdf_pedestrians['avg_heading_rad'] = np.deg2rad(lines_gdf_pedestrians['avg_heading'])
# Create the polar plot
plt.figure(figsize=(10, 6))
ax = plt.subplot(111, projection='polar')
# Configure the polar plot: 0 degrees at the top and clockwise direction
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
# Define the number of bins (8 compass sectors of 45 degrees each)
num_bins = 8
bins = np.linspace(0, 2 * np.pi, num_bins + 1)
# Count the number of objects in each bin
hist, edges = np.histogram(lines_gdf_pedestrians['avg_heading_rad'], bins=bins)
# Plot the histogram on the polar plot
bars = ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge', edgecolor='black')
# Set angle labels in degrees
ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
ax.set_xticklabels(['0°', '45°', '90°', '135°', '180°', '225°', '270°', '315°'])
# Add title
plt.title('Distribution of Objects by Avg Heading')
# Display the plot
plt.show()
/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/geopandas/geodataframe.py:1538: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
Calculating amount of objects in different directions for two-wheelers#
# Jenks natural breaks (5 classes) over the two-wheelers' average heading
two_wheelers_heading_breaks = jenkspy.jenks_breaks(lines_gdf_two_wheelers['avg_heading'], 5)
# Create a direction-class column based on the breaks
lines_gdf_two_wheelers['direction_class'] = pd.cut(lines_gdf_two_wheelers['avg_heading'], bins=two_wheelers_heading_breaks, labels=False, include_lowest=True)
# Count the number of objects in each direction class
direction_class_counts = lines_gdf_two_wheelers['direction_class'].value_counts().sort_index()
# Fix: this cell previously printed cars_heading_breaks (copy-paste from the cars cell)
print("Breaks:", two_wheelers_heading_breaks)
print("Direction Class Counts:\n", direction_class_counts)
# Creating Histogram of two-wheelers' average heading, colored by direction class
# (fix: this cell previously plotted lines_gdf_cars, copy-pasted from the cars section)
sns.histplot(data=lines_gdf_two_wheelers, x='avg_heading', hue='direction_class', bins=20, alpha=0.8, palette='BuPu', edgecolor='lightgray')
# Removing graph border
sns.despine()
# Adding title and axis names
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram, bins colored by class')
# Showing graph (was missing, unlike the parallel cars/pedestrians cells)
plt.show()
Text(0.5, 1.0, 'Histogram, bins colored by class')
# Convert angles from degrees to radians for the polar plot
# (fix: this cell previously reused lines_gdf_cars; the two-wheelers section
# must plot lines_gdf_two_wheelers)
lines_gdf_two_wheelers['avg_heading_rad'] = np.deg2rad(lines_gdf_two_wheelers['avg_heading'])
# Create the polar plot
plt.figure(figsize=(10, 6))
ax = plt.subplot(111, projection='polar')
# Configure the polar plot: 0 degrees at the top and clockwise direction
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
# Define the number of bins (8 compass sectors of 45 degrees each)
num_bins = 8
bins = np.linspace(0, 2 * np.pi, num_bins + 1)
# Count the number of objects in each bin
hist, edges = np.histogram(lines_gdf_two_wheelers['avg_heading_rad'], bins=bins)
# Plot the histogram on the polar plot
bars = ax.bar(edges[:-1], hist, width=np.diff(edges), align='edge', edgecolor='black')
# Set angle labels in degrees
ax.set_xticks(np.deg2rad(np.arange(0, 360, 45)))
ax.set_xticklabels(['0°', '45°', '90°', '135°', '180°', '225°', '270°', '315°'])
# Add title
plt.title('Distribution of Objects by Avg Heading')
# Display the plot
plt.show()